1 Loading data and packages

library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(factoextra)
library(UpSetR)
library(ComplexHeatmap)
library(circlize)
library(SingleCellExperiment)
library(org.Hs.eg.db)
library(clusterProfiler)
library(DOSE)
library(tibble)
library(BiocParallel)
library(patchwork)
library(Biostrings)

Gene names/synonyms required for databases cleaning

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
                       "external_synonym", "gene_biotype",
                       "chromosome_name", "band", "start_position", "end_position",
                       "strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

ensembl_gene_synonym <- ensembl_gene_synonym %>%
  mutate(external_synonym = sub(pattern = "ORF", external_synonym, 
                                replacement = "orf"))

attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
GTEX_data <- CTdata::GTEX_data()
## Error in readRDS(file) : error reading from connection
normal_tissues_multimapping_data <- CTdata::normal_tissues_multimapping_data()
CCLE_data <- CTdata::CCLE_data()
TCGA_TPM <- CTdata::TCGA_TPM()
DAC_treated_cells_multimapping <- CTdata::DAC_treated_cells_multimapping()
load("../CTdata/eh_data/testis_sce.rda") 
load("../CTdata/eh_data/scRNAseq_HPA.rda")
load("../CTdata/eh_data/CT_mean_methylation_in_tissues.rda")

Common figures parameters

legends_param <- list(
  labels_gp = gpar(col = "black", fontsize = 6),
  title_gp = gpar(col = "black", fontsize = 6),
  row_names_gp = gpar(fontsize = 4),
  annotation_name_side = "left")

legend_colors <- c("#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
                   "#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
                   "#9E0142")
chr_colors <- c("X-linked" = "deeppink", "Not X" = "royalblue1")

meth_colors <- c("Methylation" = "lightgreen", "Not methylation" = "gray")

2 Database comparison

2.1 Lists cleaning

CT lists from other databases have been checked (using GTEx and our GTEx_expression() funtion and GeneCards) in order to remove duplicated gene names or deprecated ones and allow comparison between databases.

2.1.1 CTdatabase

Online list copied in a csv file, several lists exist so we combined them.

We checked gene names that were a concetanation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.

CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";", 
                         escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
                          "CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis, 
                        by = c("Gene_Name" = "Gene_Symbol"))


CTdatabase_single <- CTdatabase %>%
  mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
  mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))


CTdatabase_official_names <- 
  unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, 
                       external_gene_name)) %>%
  filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)
CTdatabase_synonym <- 
  ensembl_gene_synonym %>%
  filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <- 
  rbind(CTdatabase_official_names, CTdatabase_synonym) %>% 
  left_join(CTdatabase_single)


duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
  filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
  filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
  pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"


CTdatabase_cleaned <- ensembl_gene_synonym %>%
  mutate(Gene_Name = external_synonym) %>%
  filter(external_synonym == "CXorf61") %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61", 
                    c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
  filter(external_gene_name == "CCNA1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "GOLGA6L2") %>%
  filter(ensembl_gene_id == "ENSG00000174450") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LYPD6B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT62") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT75") %>%
  filter(ensembl_gene_id == "ENSG00000291155") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LINC01192") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "TSPY1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "SSX2B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)

2.1.2 Jamin’s list

Excel file coming from supplemental data.

Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"

2.1.3 Wang’s CTatlas

Excel file coming from supplemental data.

Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx", 
    sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"

Wang_CT <- ensembl_gene_names %>% 
  filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
  right_join(Wang_CT)

Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"
Wang_CT[Wang_CT$external_gene_name == "", "external_gene_name"] <- "RNASE11"
Wang_CT[Wang_CT$external_gene_name == "CHCT1", "external_gene_name"] <- "C17orf64"
Wang_CT[Wang_CT$external_gene_name == "PRSS40A", "external_gene_name"] <- "TISP43"
Wang_CT[Wang_CT$external_gene_name == "TEX56P", "external_gene_name"] <- "C6orf201"
Wang_CT[Wang_CT$external_gene_name == "SLC25A51P4", "external_gene_name"] <- "RP11-113D6.10"
Wang_CT[Wang_CT$external_gene_name == "TCP10L3", "external_gene_name"] <- "TCP10"
Wang_CT[Wang_CT$external_gene_name == "SCAND3", "external_gene_name"] <- "ZBED9"

2.1.4 Carter’s list

Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]

Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"

2.2 CTexploreR data for selection pipeline

To characterise the differences between our database and other, we need the category we created in CTexploreR.

Hereunder is what we used for our selection pipeline (coming from make_CT_list.R in CTdata), mainly how we created the data.

# GTEX data with the tissue specificity category determined
all_genes <- as_tibble(rowData(GTEX_data), rownames = "ensembl_gene_id")
all_genes$TPM_testis <- assay(GTEX_data)[, "Testis"]

# Add multimapping_analysis assessing testis-specificity of genes "lowly_expressed"

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(normal_tissues_multimapping_data), 
                      rownames = "ensembl_gene_id"))

# Summarise both specificity

all_genes <- all_genes %>%
  mutate(testis_specificity = case_when(
    GTEX_category == "testis_specific" |
      multimapping_analysis == "testis_specific" ~ "testis_specific",
    GTEX_category == "testis_preferential" ~ "testis_preferential"))

# Add testis scRNA seq highest expressed cell type

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(testis_sce)) %>%
              dplyr::select(external_gene_name, testis_cell_type))

# Add higher in somatic scRNAseq data of normal tissues from the Human Protein Atlas 

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(scRNAseq_HPA), rownames = "ensembl_gene_id") %>%
              dplyr::select(ensembl_gene_id, external_gene_name,
                            Higher_in_somatic_cell_type))

# CCLE database analysis added

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(CCLE_data), rownames = "ensembl_gene_id"))
all_genes[is.na(all_genes$CCLE_category), "CCLE_category"] <- "not_in_CCLE"


# TCGA database analysis added

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(TCGA_TPM), rownames = "ensembl_gene_id") %>%
              dplyr::select(ensembl_gene_id, external_gene_name, percent_pos_tum,
                            percent_neg_tum, max_TPM_in_TCGA, TCGA_category))

all_genes[all_genes$lowly_expressed_in_GTEX == TRUE &
            all_genes$multimapping_analysis == "testis_specific",
          "TCGA_category"] <- "multimapping_issue"

# Add DAC analysis
induced <- as_tibble(rowData(DAC_treated_cells_multimapping), 
                     rownames = "ensembl_gene_id") %>%
  filter(induced == TRUE) %>%
  pull(external_gene_name)

all_genes <- all_genes %>%
  mutate(DAC_induced = case_when(external_gene_name %in% induced ~ TRUE,
                                 !external_gene_name %in% induced ~ FALSE))

# Add the most biologically relevant transcript (canonical and coordinates)

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id",
                       "external_gene_name",
                       "ensembl_transcript_id",
                       "external_transcript_name",
                       "chromosome_name",
                       "strand",
                       "transcript_start",
                       "transcript_end",
                       "transcription_start_site",
                       "transcript_length",
                       "transcript_biotype",
                       "transcript_is_canonical")
transcripts_infos <- as_tibble(biomaRt::getBM(attributes = attributes_vector,
                                              mart = ensembl))
canonical_transcripts <- transcripts_infos %>%
  filter(transcript_is_canonical == 1) %>%
  filter(chromosome_name %in% c(1:22, "X", "Y", "MT")) %>%
  filter(transcript_biotype == "protein_coding" | transcript_biotype == "lncRNA")
all_genes <- all_genes %>%
  left_join(canonical_transcripts %>%
              dplyr::select(ensembl_gene_id,
                            external_transcript_name, ensembl_transcript_id,
                            chromosome_name, strand, transcript_start,
                            transcript_end, transcription_start_site,
                            transcript_length, transcript_biotype))

From there, we filtered based on the testis_specificity (“testis_specific” or “testis_preferential”), testis_cell_type (not “Macrophage”, “Endothelial”, “Myoid”, “Sertoli”, “Leydig”), Higher_in_somatic_cell_type (TRUE), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then the most relevant transcript was validated manually (and check that there is a transcript activated in testis and the same is activated in tumor).

2.3 CTexploreR VS CTdatabase

CTdatabase_ours <- Venn(list(CTdatabase = CTdatabase_cleaned$external_gene_name,
                             CTexploreR = CT_genes$external_gene_name))
plot(CTdatabase_ours)

2.4 CTexploreR VS omics databases

core_ours <- Venn(list(Jamin = Jamin_core_CT$Gene, 
                       CTexploreR = CT_genes$external_gene_name))

Wang_ours <- Venn(list(Wang = Wang_CT$external_gene_name, 
                       CTexploreR = CT_genes$external_gene_name))

Carter_ours <- Venn(list(Carter_CT = Carter_CT$Gene_Name, 
                         CTexploreR = CT_genes$external_gene_name))

gene_list <- list(Jamin = Jamin_core_CT$Gene, 
                  CTdatabase = CTdatabase_cleaned$external_gene_name, 
                  CTatlas = Wang_CT$external_gene_name, 
                  CTexploreR = CT_genes$external_gene_name, 
                  Carter = Carter_CT$Gene_Name)

upset_omics <- fromList(gene_list[-2])
upset(upset_omics)

2.5 Characterisation of differences with all databases

common <- unique(c(core_ours@IntersectionSets[["11"]], 
                   CTdatabase_ours@IntersectionSets[["11"]], 
                   Wang_ours@IntersectionSets[["11"]], 
                   Carter_ours@IntersectionSets[["11"]]))

length(common)
## [1] 212
length(common)/dim(CT_genes)[1] * 100
## [1] 71.14094
lost_list <- unique(c(core_ours@IntersectionSets[["10"]], 
                 CTdatabase_ours@IntersectionSets[["10"]],
                 Wang_ours@IntersectionSets[["10"]],
                 Carter_ours@IntersectionSets[["10"]]))

lost <- all_genes %>%
  filter(external_gene_name %in% lost_list)

not_specific <- filter(lost, is.na(testis_specificity))

somatic_testis <- filter(lost, testis_cell_type %in% c( "Macrophage", 
                                                        "Endothelial", "Myoid",
                                                        "Sertoli", "Leydig"))

somatic_tissue <- filter(lost, Higher_in_somatic_cell_type == TRUE)

not_TCGA_activated <- filter(lost, TCGA_category != "activated" & 
                               TCGA_category != "multimapping_issue")

not_CCLE_activated <- filter(lost, CCLE_category  != "activated")

transcript_prob <- lost %>% 
  filter(testis_specificity == "testis_specific" |
           testis_specificity == "testis_preferential") %>%
  filter(TCGA_category == "activated" | TCGA_category == "multimapping_issue") %>% 
  filter(CCLE_category == "activated") %>% 
  dim()

We have lost 979 genes in total. Among them, 58.0183861% are not considered testis specific, 2.8600613% are expressed in testis somatic cells, 11.13381% are expressed in somatic cells, 62.206333% are not activated in TCGA samples, 66.4964249% are not activated in CCLE cell lines and 1.9407559% is lost due to transcripts problems.

3 CT genes characteristics

table(CT_genes$testis_specificity)
## 
## testis_preferential     testis_specific 
##                  92                 206
table(CT_genes$transcript_biotype)
## 
##                  lncRNA nonsense_mediated_decay          protein_coding 
##                      48                       3                     247

Most genes are testis specific (69.1275168%). Most genes are mainly protein coding genes (82.885906%).

3.1 X chromosome and regulated by methylation

In CTexploreR, genes have been characterised as regulated by methylation or not.

table(CT_genes$X_linked)
## 
## FALSE  TRUE 
##   205    93
table(CT_genes$regulated_by_methylation)
## 
## FALSE  TRUE 
##   138   160
table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE   128   77
##   TRUE     10   83

Genes are enriched on the X chromosome (31.2080537%). Also, 160 genes have been flagged as regulated by methylation (53.6912752%). It is interesting to study the link between these two characteristics

On the chromosome X, there is a clear enrichment of CT genes regulated by methylation (83/93 chrX genes or 83/160 genes regulated by methylation).

Let’s check that with a statistical test, I here want to see if, on there is an enrichment of genes regulated by methylation on the X chromosome. I need to do a Pearson Chi square test (to know if observed proportion differ from expected proportion). It is a statistical method to determine if two categorical variables have a significant correlation between them. I can directly put a matrix (my table) in the function

chisq.test(table(CT_genes$X_linked, CT_genes$regulated_by_methylation))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
## X-squared = 66.676, df = 1, p-value = 3.2e-16
CT_genes$chr_factor <- factor(CT_genes$chr,
                       levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
                                  "10", "11", "12", "13", "14", "15", "16", "17", 
                                  "18", "19", "20", "21", "22", "X", "Y"))

CT_genes %>% 
  mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
                                           "Regulated by methylation",
                                           "Not regulated by methylation")) %>% 
  ggplot(aes(x = chr_factor, fill = testis_specificity, color = X_linked)) +
  geom_bar(stat = 'count') +
  scale_fill_manual(values = c("goldenrod1", "mediumpurple1")) +
  scale_color_manual(values = c("royalblue1", "deeppink"), guide = "none") +
  facet_wrap(~ regulated_by_methylation, ncol = 2) +
  theme_bw() +
  xlab("Chromosome") +
  ylab("Number of genes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "top",
        axis.title = element_text(size = 10, color = "gray10"))

There is indeed a significative link between regulation by methylation and being on the X chromosome. There is thus an enrichment of CT genes regulated by methylation on the X chromosome (and inversely).

3.2 Tumour and methylation

Using CTexploreR functions, we can explore all CT genes or focus on some potential targets.

3.2.1 All CT

For these heatmaps, the code comes from the function but has been copies to add some annotations.

chr_mat <- as.matrix(CT_genes$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes$external_gene_name
row_ha_chr <- rowAnnotation(chr_factor = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr_factor = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")



regulation_mat <- as.matrix(CT_genes$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes$regulated_by_methylation, CT_genes$X_linked)
database <- TCGA_TPM
database$tumor <- sub(pattern = 'TCGA-', x = database$project_id, '')
database$type <- "Tumor"
database$type[database$shortLetterCode == "NT"] <- "Peritumoral"
database <- database[, order(database$tumor, database$type)]

genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]

column_ha_type <- HeatmapAnnotation(
  Type = database$type,
  border = TRUE,
  col = list(Type = c("Tumor" = "salmon",
                      "Peritumoral" = "mediumseagreen")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

column_ha_tumor <- HeatmapAnnotation(
  Tumor = database$tumor,
  border = TRUE,
  col = list(Tumor = c("BRCA" = "midnightblue", "COAD" = "darkorchid2",
                 "ESCA" = "gold", "HNSC" = "deeppink2",
                 "LUAD" = "seagreen", "LUSC" = "seagreen3",
                 "SKCM" = "red3")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

split_by <- factor(database$tumor)
col_annot <- column_ha_tumor

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name


Heatmap(mat[, , drop = FALSE],
        name = "logTPM",
        column_title = paste0("Expression in TCGA samples (all)"),
        column_split = split_by,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                         legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = col_annot,
        left_annotation = left_annot)

database <- CCLE_data
database$type <- tolower(database$type)
genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name

df_col <- data.frame("cell_line" = colData(database)$cell_line_name,
                     "type" = as.factor(colData(database)$type))
rownames(df_col) <- rownames(colData(database))
df_col <- df_col[order(df_col$type), ]

column_ha_type <- HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param,
  col = list(type = c("lung" = "seagreen3", "skin" = "red3",
                 "bile_duct" = "mediumpurple1", "bladder" = "mistyrose2",
                 "colorectal" = "plum", "lymphoma" = "steelblue1",
                 "uterine" = "darkorange4", "myeloma" = "turquoise3", 
                 "kidney" = "thistle4",
                 "pancreatic" = "darkmagenta", "brain" = "palegreen2",
                 "gastric" = "wheat3", "breast" = "midnightblue",
                 "bone" = "sienna1", "head_and_neck" = "deeppink2",
                 "ovarian" = "tan3", "sarcoma" = "lightcoral",
                 "leukemia" = "steelblue4", "esophageal"= "khaki",
                 "neuroblastoma" = "olivedrab1")))



Heatmap(mat[, rownames(df_col), drop = FALSE],
        name = "logTPM",
        column_title = "Gene Expression in tumor cell lines (CCLE)",
        column_split = factor(df_col$type),
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                          legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_row_dend = FALSE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = c(column_ha_type),
        left_annotation = left_annot)

genes <- CT_genes$external_gene_name
database <- CT_mean_methylation_in_tissues[rownames(CT_mean_methylation_in_tissues) %in% genes]

mat <- na.omit(assay(database))
clustering_option <- TRUE

row_ha_chr_meth <- rowAnnotation(chr = chr_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


row_ha_reg_meth <- rowAnnotation(regulation = regulation_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot_meth <- c(row_ha_chr_meth, row_ha_reg_meth, gap = unit(1, "mm"))

split_meth <- data.frame(filter(CT_genes, external_gene_name %in% rownames(mat))$regulated_by_methylation, 
                    filter(CT_genes, external_gene_name %in% rownames(mat))$X_linked)

Heatmap(mat,
        column_title = 'Promoter mean methylation level by tissue',
        name = 'Meth',
        col = colorRamp2(c(1:100),
                         colorRampPalette(c("moccasin","dodgerblue4"))(100)),
        na_col = "gray80",
        cluster_rows = clustering_option,
        cluster_columns = FALSE,
        row_split = split_meth,
        row_title_gp = gpar(fontsize = 0),
        show_row_names = TRUE,
        show_heatmap_legend = TRUE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 3),
        column_names_gp = gpar(fontsize = 8),
        column_names_side = "bottom",
        row_names_side = "right",
        left_annotation = left_annot_meth)

3.2.2 MAGE genes

MAGE_genes <- filter(CT_genes, family == "MAGE")$external_gene_name

TCGA_expression(tumor = "all", 
                genes = MAGE_genes,
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

CCLE_expression(genes = MAGE_genes,
                 type = c("lung", "skin", "bile_duct", "bladder", 
                          "colorectal", "lymphoma", "uterine",
                          "myeloma", "kidney", "pancreatic", "brain", 
                          "gastric", "breast", "bone", "head_and_neck", 
                          "ovarian", "sarcoma", "leukemia", "esophageal",
                          "neuroblastoma"), units = "log_TPM")

normal_tissues_mean_methylation(MAGE_genes, na.omit = TRUE)

3.3 Single Cell analysis : timing of expression

In CTexploreR, we there is single cell data from the testis, we thus can analyse CT genes expression during spermatogenesis.

There is only data for 251 of our 298 CT genes.

NB : SSC, Spermatogonia and early spermatocytes are premeiotic cells. Late spermatocytes (between both meiosis), round spermatid, elongated spermatid and sperm are postmeiotic cells.

genes_avail <- 
  CT_genes$external_gene_name[CT_genes$external_gene_name %in% unique(rownames(testis_sce))]
table(CT_genes$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte     Round_spermatid 
##                  41                  25                  23                  64 
##              Sperm1              Sperm2       Spermatogonia                 SSC 
##                  12                  14                  29                  33
table(CT_genes$testis_cell_type)/length(genes_avail)*100
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte     Round_spermatid 
##           16.334661            9.960159            9.163347           25.498008 
##              Sperm1              Sperm2       Spermatogonia                 SSC 
##            4.780876            5.577689           11.553785           13.147410

41% of genes are mainly expressed pre-meioticly.

germ_cells <- c("SSC", "Spermatogonia", "Early_spermatocyte",
                "Late_spermatocyte","Round_spermatid", "Elongated_spermatid",
                "Sperm1", "Sperm2")
somatic_cells <- c("Macrophage", "Endothelial", "Myoid", "Sertoli", "Leydig")

testis_sce_CT <- testis_sce[genes_avail, ]
  
mat <- SingleCellExperiment::logcounts(testis_sce_CT)

df_col <- data.frame(clusters = colData(testis_sce_CT)$clusters,
                     type = colData(testis_sce_CT)$type,
                     Donor = colData(testis_sce_CT)$Donor)
rownames(df_col) <- colnames(testis_sce_CT)

df_col <- df_col[order(df_col$type),]
df_col$lineage <- "Germ cells"
df_col$lineage[df_col$type %in% somatic_cells] <- "Somatic cells"
    
column_ha_type = HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  col = list(type = c("SSC" = "floralwhite", "Spermatogonia" = "moccasin",
                   "Early_spermatocyte" = "gold", 
                   "Late_spermatocyte" = "orange",
                   "Round_spermatid" = "red2", 
                   "Elongated_spermatid" = "darkred",
                   "Sperm1" = "violet", "Sperm2" = "purple", 
                   "Sertoli" = "gray", 
                   "Leydig" = "cadetblue2", "Myoid" = "springgreen3", 
                   "Macrophage" = "gray10",
                   "Endothelial" = "steelblue")),
  
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)
    
column_ha_lineage = HeatmapAnnotation(
  lineage = df_col$lineage,
  border = TRUE,
  col = list(lineage = c("Germ cells" = "salmon", "Somatic cells" = "cyan4")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

scale_lims <- c(0, quantile(rowMax(mat), 0.75))
top_annot <- c(column_ha_lineage, column_ha_type)

# Until here is what's in the function, hereunder is my addition/change in Heatmap()

CT_genes_avail <- filter(CT_genes, external_gene_name %in% genes_avail)

chr_mat <- as.matrix(CT_genes_avail$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes_avail$external_gene_name
row_ha_chr <- rowAnnotation(chr = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


regulation_mat <- as.matrix(CT_genes_avail$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes_avail$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes_avail$regulated_by_methylation, CT_genes_avail$X_linked)
    
Heatmap(mat[genes_avail, rownames(df_col), drop = FALSE],
        name = "logCounts",
        column_title = "Expression in testis cells (scRNAseq)",
        column_split = df_col$type,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        show_column_names = FALSE,
        show_column_dend = FALSE,
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        cluster_columns = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        col = colorRamp2(seq(scale_lims[1], scale_lims[2], length = 11),                 
                         legend_colors),
        top_annotation = top_annot,
        left_annotation = left_annot,
        heatmap_legend_param = legends_param)

We used these data to determine in which testis cell type each gene is mostly expressed.

CT_genes$main_cell_type_expression <- factor(CT_genes$testis_cell_type,
                                                levels = c("SSC", 
                                               "Spermatogonia", 
                                               "Early_spermatocyte",
                                               "Late_spermatocyte",
                                               "Round_spermatid",
                                               "Elongated_spermatid",
                                               "Sperm1",
                                               "Sperm2",
                                               "Sertoli"))

CT_genes %>% 
  mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
                                           "Regulated by methylation",
                                           "Not regulated by methylation")) %>%
  filter(!is.na(testis_cell_type)) %>%
  filter(main_cell_type_expression != "Sertoli") %>% 
  ggplot(aes(x = chr, fill = main_cell_type_expression,
             color = main_cell_type_expression)) +
  scale_fill_manual(values = c("lightyellow2", "moccasin", "gold", "orange","red2",
                               "darkred", "violet", "purple")) +
  scale_color_manual(values = c("lightyellow3", "navajowhite2", "gold", "orange","red2",
                               "darkred", "violet", "purple")) +
  geom_bar(stat = 'count') +
  facet_grid(main_cell_type_expression ~ regulated_by_methylation) +
  xlab("Chromosome") +
  ylab("Number of genes") +
  theme_bw() +
  theme(
    axis.text.x = element_text(color = "black", angle = 90, vjust = 0.5, size = 10),
    axis.title = element_text(size = 10, color = "gray10"),
    strip.text.y = element_blank()) 

14 genes are on the X chromosome but escape X chromosome inactivation and are activates post-meioticly.

CT_genes %>% 
  filter(X_linked) %>% 
  filter(testis_cell_type %in% c("Late_spermatocyte","Round_spermatid", 
                                 "Elongated_spermatid", "Sperm1", "Sperm2"))

3.4 Gene function

go_ora <- enrichGO(gene = CT_genes$ensembl_gene_id,
                   keyType = "ENSEMBL",
                   OrgDb = org.Hs.eg.db,
                   ont = "all",
                   readable = TRUE)
as_tibble(go_ora)
as_tibble(go_ora) %>% 
  arrange(desc(Count)) %>% 
  head(12) %>% 
  mutate(Ratio = Count/183) %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  theme(axis.text.y = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = 10, color = "gray10"))+ 
  geom_text(aes(0, y = Description, label = Description),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 4)

As we can see here, most of genes are indeed linked to functions from reproduction. I represented here the 12 categories with the most genes, all from biological processes. However, they are enriched in 89 different GO terms.

We’ve also added a column tumor suppressor and oncogene in CTexploreR. These information come from Cancermine.

table(CT_genes$oncogene) 
## 
## oncogene 
##       34
table(CT_genes$tumor_suppressor)
## 
## tumor_suppressor 
##               15
table(CT_genes$oncogene, CT_genes$tumor_suppressor) 
##           
##            tumor_suppressor
##   oncogene                9

4 Package application : What are the mechanisms involved in CT demethylation in cancer

4.1 Determination of PMDs in male fibroblasts celles

WGBS data from GM23248 cells (male so X chromosome is reliable) downloaded from ENCODE. Data was processed using a WGBS pipeline [MethPipe] (http://smithlabresearch.org/software/methpipe/) including TrimGalore to remove low quality reads and trim the adapter from the sequences.

The pipeline has been verified by comparing PMDs already defined by Lister et al. on IMR90 and PMDs obtained with MethPipe.

wget https://www.encodeproject.org/files/ENCFF142GZA/@@download/ENCFF142GZA.fastq.gz
wget https://www.encodeproject.org/files/ENCFF845RUF/@@download/ENCFF845RUF.fastq.gz
wget https://www.encodeproject.org/files/ENCFF968GRR/@@download/ENCFF968GRR.fastq.gz
wget https://www.encodeproject.org/files/ENCFF975YNS/@@download/ENCFF975YNS.fastq.gz
#!/bin/bash
mkdir /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/rep1_ENCLB777FMV;
cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/rep1_ENCLB777FMV
/opt/TrimGalore/trim_galore --fastqc --paired /data/cbio/2021-mth-julie-devis/WGBS-processing/GM23248_data/ENCFF142GZA.fastq /data/cbio/2021-mth-julie-devis/WGBS-processing/GM23248_data/ENCFF845RUF.fastq

mkdir /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/rep2_ENCLB436YZI;
cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/rep2_ENCLB436YZI
/opt/TrimGalore/trim_galore --fastqc --paired /data/cbio/2021-mth-julie-devis/WGBS-processing/GM23248_data/ENCFF975YNS.fastq /data/cbio/2021-mth-julie-devis/WGBS-processing/GM23248_data/ENCFF968GRR.fastq

#!/bin/bash

cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/ENCODE_GM23248/rep1_ENCLB777FMV;

#Alignement
/home/users/jdevis/walt/bin/walt \
-i /data/cbio/2021-mth-julie-devis/WGBS-processing/genomes/hg38_by_Chr.dbindex \
-1 ENCFF142GZA_val_1.fq \
-2 ENCFF845RUF_val_2.fq \
-o rep1_ENCLB777FMV_mapping.mr;

#Duplicate-remover
LC_ALL=C sort -k 1,1 -k 2,2n -k 3,3n -k 6,6 -o rep1_ENCLB777FMV.mr.sorted_start rep1_ENCLB777FMV_mapping.mr;
/opt/methpipe/build/duplicate-remover -S rep1_ENCLB777FMV_dremove_stat.txt -o rep1_ENCLB777FMV.mr.dremove rep1_ENCLB777FMV.mr.sorted_start;

#bsrate
/opt/methpipe/build/bsrate -c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o rep1_ENCLB777FMV.bsrate rep1_ENCLB777FMV.mr.dremove;

#Methylation levels
/opt/methpipe/build/methcounts -n \
-c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o rep1_ENCLB777FMV.meth rep1_ENCLB777FMV.mr.dremove;

#Merge CpG
/opt/methpipe/build/symmetric-cpgs -o rep1_ENCLB777FMV_CpG_merge.meth rep1_ENCLB777FMV.meth;

#Stats
/opt/methpipe/build/levels -o rep1_ENCLB777FMV.levels rep1_ENCLB777FMV.meth;


cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/ENCODE_GM23248/rep2_ENCLB436YZI;

#Alignement
/home/users/jdevis/walt/bin/walt \
-i /data/cbio/2021-mth-julie-devis/WGBS-processing/genomes/hg38_by_Chr.dbindex \
-1 ENCFF975YNS_val_1.fq \
-2 ENCFF968GRR_val_2.fq \
-o rep2_ENCLB436YZI_mapping.mr;

#Duplicate-remover
LC_ALL=C sort -k 1,1 -k 2,2n -k 3,3n -k 6,6 -o rep2_ENCLB436YZI.mr.sorted_start rep2_ENCLB436YZI_mapping.mr;
/opt/methpipe/build/duplicate-remover -S rep2_ENCLB436YZI_dremove_stat.txt -o rep2_ENCLB436YZI.mr.dremove rep2_ENCLB436YZI.mr.sorted_start;

#bsrate
/opt/methpipe/build/bsrate -c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o rep2_ENCLB436YZI.bsrate rep2_ENCLB436YZI.mr.dremove;

#Methylation levels
/opt/methpipe/build/methcounts -n \
-c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o rep2_ENCLB436YZI.meth rep2_ENCLB436YZI.mr.dremove;

#Merge CpG
/opt/methpipe/build/symmetric-cpgs -o rep2_ENCLB436YZI_CpG_merge.meth rep2_ENCLB436YZI.meth;

#Stats
/opt/methpipe/build/levels -o rep2_ENCLB436YZI.levels rep2_ENCLB436YZI.meth;


#Merge both replicats
cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/ENCODE_GM23248;

/opt/methpipe/build/merge-methcounts rep1_ENCLB777FMV/rep1_ENCLB777FMV_CpG_merge.meth \
rep2_ENCLB436YZI/rep2_ENCLB436YZI_CpG_merge.meth \
-o GM23248_merged.meth;

#pmd
/opt/methpipe/build/pmd -v -i 1000 -o GM23248.pmd GM23248_merged.meth
GM23248_meth <- read_delim("GM23248_merged.meth", "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
names(GM23248_meth) <- c("chromosome", "position", "strand", "sequence_context","methylation_level","number_of_reads")

We can use the PMD file coming from MethPipe and change it into a GRanges object so that we can easily do overlapping analysis

my_PMD_GM23248 <- read_delim("data/GM23248.pmd", "\t", 
                             escape_double = FALSE, col_names = FALSE, 
                             trim_ws = TRUE)
names(my_PMD_GM23248) <- c("chr", "start","end","name_quality","bins","strand")
my_PMD_GM23248 <- separate(my_PMD_GM23248, "name_quality", 
                           into = c("name", "likelihood_start", "certainty_start",
                                    "likelihood_end", "certainty_end"), 
                           sep = ":")
my_PMD_GM23248_GR <- makeGRangesFromDataFrame(my_PMD_GM23248 [, -10], keep.extra.columns = TRUE)

Not PMD and NA regions aren’t dissociated. Coverage of windows of 10kb were calculated along the genome. A threshold of 1000 reads per 10kb is necessary to not be considered NA.

for (chromosome_name in paste0("chr",c(1:22,"X","Y"))){
  
  roi <- GM23248_meth %>% 
    filter(chromosome == chromosome_name) 
  
  res <- data.frame(region_i = floor(min(roi$position)/10000):floor(max(roi$position)/10000),
                    reads_tot = rep(NA,length(floor(min(roi$position)/10000):floor(max(roi$position)/10000))),
                    chromosome = rep(unique(roi$chromosome),length(floor(min(roi$position)/10000):floor(max(roi$position)/10000))))
  
  res <- res %>% 
    mutate(start_region = region_i*10000) %>% 
    mutate(end_region = region_i*10000+10000) %>% 
    column_to_rownames("region_i")
  
  for (i in floor(min(roi$position)/10000):floor(max(roi$position)/10000)){
    value <-  i*10000
    tmp <- roi %>% 
      filter(position >= value & position < (value + 10000))
    res[as.character(i) , "reads_tot"] <- sum(tmp$number_of_reads)
    }
  
  res_GR <- makeGRangesFromDataFrame(res, start.field = "start_region", 
                                     end.field = "end_region", keep.extra.columns = TRUE)
  
  overlap_boul <- overlapsAny(res_GR, my_PMD_GM23248_GR, type = "within")
  
  res$in_PMD <- overlap_boul
  
  saveRDS(res, file = paste0("data/10kb_reads_PMD_", chromosome_name, ".rds"))

}

windows_reads_PMD_chr1 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr1.rds")
windows_reads_PMD_chr2 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr2.rds")
windows_reads_PMD_chr3 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr3.rds")
windows_reads_PMD_chr4 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr4.rds")
windows_reads_PMD_chr5 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr5.rds")
windows_reads_PMD_chr6 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr6.rds")
windows_reads_PMD_chr7 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr7.rds")
windows_reads_PMD_chr8 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr8.rds")
windows_reads_PMD_chr9 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr9.rds")
windows_reads_PMD_chr10 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr10.rds")
windows_reads_PMD_chr11 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr11.rds")
windows_reads_PMD_chr12 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr12.rds")
windows_reads_PMD_chr13 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr13.rds")
windows_reads_PMD_chr14 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr14.rds")
windows_reads_PMD_chr15 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr15.rds")
windows_reads_PMD_chr16 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr16.rds")
windows_reads_PMD_chr17 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr17.rds")
windows_reads_PMD_chr18 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr18.rds")
windows_reads_PMD_chr19 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr19.rds")
windows_reads_PMD_chr20 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr20.rds")
windows_reads_PMD_chr21 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr21.rds")
windows_reads_PMD_chr22 <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chr22.rds")
windows_reads_PMD_chrX <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chrX.rds")
windows_reads_PMD_chrY <- readRDS("/data/cbio/2021-mth-julie-devis/analyses-rmd/data/10kb_reads_PMD_chrY.rds")

windows_reads_PMD <- rbind(windows_reads_PMD_chr1, windows_reads_PMD_chr2, windows_reads_PMD_chr3, windows_reads_PMD_chr4, windows_reads_PMD_chr5, windows_reads_PMD_chr6, windows_reads_PMD_chr7, windows_reads_PMD_chr8, windows_reads_PMD_chr9, windows_reads_PMD_chr10, windows_reads_PMD_chr11, windows_reads_PMD_chr12, windows_reads_PMD_chr13, windows_reads_PMD_chr14, windows_reads_PMD_chr15, windows_reads_PMD_chr16, windows_reads_PMD_chr17, windows_reads_PMD_chr18, windows_reads_PMD_chr19, windows_reads_PMD_chr20, windows_reads_PMD_chr21, windows_reads_PMD_chr22, windows_reads_PMD_chrX, windows_reads_PMD_chrY)

rownames(windows_reads_PMD) <- NULL

saveRDS(windows_reads_PMD, file = "data/10kb_reads_PMD_all.rds")

Here we load the coverage for all the 10k windows and determine which one are non-PMD due to coverage issues. These region are to be removed from the analysis.

windows_reads_PMD <- readRDS("data/10kb_reads_PMD_all.rds")

NA_nonPMD_GR <- windows_reads_PMD %>% 
  filter(!in_PMD) %>% 
  filter(reads_tot <= 1000) %>% 
  makeGRangesFromDataFrame(start.field = "start_region", end.field = "end_region",
                           keep.extra.columns = TRUE) %>% 
  GenomicRanges::reduce()

4.2 Are CT genes associated with PMDs ?

4.2.1 CT regulated by methylation and PMD overlap

CT_genes <- CT_genes %>% 
  mutate(start_prom = case_when(strand == 1 ~ transcription_start_site - 1000,
                                strand == -1 ~ transcription_start_site - 500)) %>% 
  mutate(end_prom = case_when(strand == 1 ~ transcription_start_site + 500,
                              strand == -1 ~ transcription_start_site + 1000))

CT_genes$strand <- ifelse(CT_genes$strand == "1", "+", "-")
seqlevelsStyle(CT_genes$chr) <- "UCSC"

CT_meth_regulated <- CT_genes %>% 
  filter(regulated_by_methylation)
meth_reg_prom_GR <- makeGRangesFromDataFrame(CT_meth_regulated, start.field = "start_prom",
                                    end.field = "end_prom",
                                    keep.extra.columns = TRUE)
CTmeth_PMD_GR <- subsetByOverlaps(meth_reg_prom_GR, my_PMD_GM23248_GR)
CTmeth_not_PMD_GR <- subsetByOverlaps(subsetByOverlaps(meth_reg_prom_GR, 
                                                       my_PMD_GM23248_GR, 
                                                   invert = TRUE),
                                      NA_nonPMD_GR, invert = TRUE)
CTmeth_NA_PMD_GR <-subsetByOverlaps(subsetByOverlaps(meth_reg_prom_GR, 
                                                     my_PMD_GM23248_GR, 
                                                 invert = TRUE), 
                                    NA_nonPMD_GR)

Now that these overlapping have been done, I can report them into my data as a boulean column to use analyse it.

CT_meth_regulated <- CT_meth_regulated %>% 
  mutate(PMD = case_when(CT_meth_regulated$external_gene_name %in% CTmeth_PMD_GR$external_gene_name ~ TRUE,
                         CT_meth_regulated$external_gene_name %in% CTmeth_not_PMD_GR$external_gene_name ~ FALSE,
                         CT_meth_regulated$external_gene_name %in% CTmeth_NA_PMD_GR$external_gene_name ~ NA))
table(is.na(CT_meth_regulated$PMD))
## 
## FALSE  TRUE 
##   122    38
table(CT_meth_regulated$PMD)
## 
## FALSE  TRUE 
##    60    62
PMD_perc <- table(CT_meth_regulated$PMD)[[2]] / 
  sum(table(CT_meth_regulated$PMD))*100
PMD_perc
## [1] 50.81967
X_in_PMD_table <- CT_meth_regulated %>% 
  filter(X_linked) %>% 
  dplyr::select(PMD) %>% 
  table()
X_in_PMD_table
## PMD
## FALSE  TRUE 
##    20    32
PMD_chr_repartition <- CT_meth_regulated %>% 
  filter(!is.na(PMD)) %>% 
  dplyr::select(X_linked) %>% 
  table()
PMD_chr_repartition
## X_linked
## FALSE  TRUE 
##    70    52

4.2.2 CT not regulated by methylation and PMD overlap

CT_not_meth_regulated <- CT_genes %>% 
  filter(!regulated_by_methylation)

prom_GR_not_reg <- makeGRangesFromDataFrame(CT_not_meth_regulated, 
                                            start.field = "start_prom",
                                    end.field = "end_prom",
                                    keep.extra.columns = TRUE)
not_reg_PMD_GR <- subsetByOverlaps(prom_GR_not_reg, my_PMD_GM23248_GR)
not_reg_not_PMD_GR <- subsetByOverlaps(subsetByOverlaps(prom_GR_not_reg,
                                                        my_PMD_GM23248_GR, 
                                                   invert = TRUE), 
                                  NA_nonPMD_GR, invert = TRUE)
not_reg_NA_PMD_GR <-subsetByOverlaps(subsetByOverlaps(prom_GR_not_reg,
                                                      my_PMD_GM23248_GR, 
                                                 invert = TRUE),
                                NA_nonPMD_GR)

CT_not_meth_regulated <- CT_not_meth_regulated %>% 
  mutate(PMD = case_when(CT_not_meth_regulated$external_gene_name %in% 
                           not_reg_PMD_GR$external_gene_name ~ TRUE,
                         CT_not_meth_regulated$external_gene_name %in% 
                           not_reg_not_PMD_GR$external_gene_name ~ FALSE,
                         CT_not_meth_regulated$external_gene_name %in% 
                           not_reg_NA_PMD_GR$external_gene_name ~ NA))

table(is.na(CT_not_meth_regulated$PMD))
## 
## FALSE  TRUE 
##   132     6
table(CT_not_meth_regulated$PMD)
## 
## FALSE  TRUE 
##    79    53
CT_not_meth_reg_PMD_perc <- table(CT_not_meth_regulated$PMD)[[2]] / 
  sum(table(CT_not_meth_regulated$PMD))*100
CT_not_meth_reg_PMD_perc
## [1] 40.15152

4.2.3 Enrichment analysis

The numbers are useful but we don’t know if it is significant or not. In order to do so, we’re going to compare this percentage with an empiric distribution. For that we need groups of genes comparable.

This distribution is going to be realised by 1000 times computing the % of genes in PMD in 70 tissue-specific genes on autosomes and 52 tissue-specific genes on the chromosome X. We use tissue-specific genes because this characteristic can make them more prone to be in a repressive region like PMDs.

We then have to create a pool of non-CT tissue-specific genes with their localisation, for that we use the list of tissue specific genes that can be found on the Human Protein Atlas (3106 tissue enriched genes: At least four-fold higher mRNA level in a particular tissue compared to any other tissue).

It is necessary to use biomaRt because we need transcription start sites to be able to realise the same analysis on enriched genes than on CT genes. Also, we need to remove CT genes and genes that are in the non-covered regions.

enriched_genes <- read.delim("data/HPA_tissue_enriched_genes.tsv")

attributes_vector <- c("ensembl_gene_id","external_gene_name", "chromosome_name",
                       "strand", "transcription_start_site", 
                       "external_transcript_name", "transcript_is_canonical")
biomaRt_query <- getBM(attributes = attributes_vector, mart = ensembl)
enriched_genes_biomart <- biomaRt_query %>% 
  filter(ensembl_gene_id %in% enriched_genes$Ensembl)
enriched_genes_biomart <- enriched_genes_biomart %>% 
  mutate(start_prom = case_when(strand == 1 ~ transcription_start_site - 1000,
                                strand == -1 ~ transcription_start_site - 500)) %>% 
  mutate(end_prom = case_when(strand == 1 ~ transcription_start_site + 500,
                              strand == -1 ~ transcription_start_site + 1000)) %>% 
  filter(!external_gene_name %in% CT_genes$external_gene_name) %>% 
  filter(transcript_is_canonical == 1) %>%
  filter(chromosome_name%in%c(1:22, "X", "Y"))

enriched_genes_biomart$strand <- ifelse(enriched_genes_biomart$strand == "1", "+", "-")
seqlevelsStyle(enriched_genes_biomart$chromosome_name) <- "UCSC"

enriched_genes_GR <- makeGRangesFromDataFrame(enriched_genes_biomart, 
                                              start.field = "start_prom",
                                              end.field = "end_prom",
                                              keep.extra.columns = TRUE)
enriched_NA_GR <-subsetByOverlaps(subsetByOverlaps(enriched_genes_GR, 
                                                   my_PMD_GM23248_GR, 
                                                   invert = TRUE),
                                  NA_nonPMD_GR)

enriched_genes_analysable <- as.data.frame(
  enriched_genes_GR[!enriched_genes_GR$ensembl_gene_id %in% enriched_NA_GR$ensembl_gene_id, ])
random_percentage_chrprop <- function(i, data, nchrX, nautosomes){
  random_sample1 <- sample_n(filter(data, seqnames == "chrX"), nchrX)
  random_sample2 <- sample_n(filter(data, seqnames != "chrX"), nautosomes)
  random_sample <- rbind(random_sample1,random_sample2)
  random_sample_GR <- makeGRangesFromDataFrame(random_sample, 
                                               keep.extra.columns = TRUE)
  length(subsetByOverlaps(random_sample_GR, my_PMD_GM23248_GR)) / 
    length(random_sample_GR)*100
}

distrib_enriched <- unlist(bplapply(1:1000, random_percentage_chrprop, 
                                    data = enriched_genes_analysable, 
                                    nchrX = PMD_chr_repartition[[2]], 
                                    nautosomes = PMD_chr_repartition[[1]]))
hist(distrib_enriched,
     col="violetred1",
     border = "violetred3",
     breaks = 20,
     main = paste0("Percentage of promoters in PMDs when randomly taking ",
                   sum(table(CT_meth_regulated$PMD)), 
                   " tissue-specific 
genes 1000 times (taking chromosome proportion into account)"),
     cex.main = 0.75,
     xlab = "Percentages",
     ylab = "Frequency",
     xlim = c(20,60))
abline(v = PMD_perc, col = "slateblue1", lwd = 3)
text(PMD_perc, 120, "CT genes regulated 
by methylation", col = "slateblue4", cex = 0.8)
abline(v = CT_not_meth_reg_PMD_perc, col = "darkseagreen1", lwd = 3)
text(CT_not_meth_reg_PMD_perc, 120, "CT genes not regulated 
by methylation", col = "darkseagreen", cex = 0.8)

PMD_CT_results <- rbind(dplyr::select(CT_not_meth_regulated, 
                                      c(PMD, external_gene_name)),
                    dplyr::select(CT_meth_regulated, 
                                  c(PMD, external_gene_name)))
CT_genes <- left_join(CT_genes, PMD_CT_results)
## Joining with `by = join_by(external_gene_name)`

4.3 Methylation in an aging model : CD4+ cells

Now that we know that CT genes are indeed enriched in PMDs, we have to go further. Indeed, we know that proliferation has an impact on PMD and their methylation : there are losses of methylation with time. As CT genes are located in PMDs, we can hypothesise that their methylation might be impacted by the losses happening during proliferation. We thus want to focus on the impact on CT genes of the methylation changes happening on PMDss in proliferative or aging cells.

4.3.1 Data processing

Data coming from Heyn 2012 paper.

#!/bin/bash

#Newborn
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330578
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330579
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394135
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394136
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394137
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394138
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394139
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394140
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394141
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR394142

#Centenarian
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330574
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330576
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330577
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR330575

#Middle-age
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR389249
/opt/sratoolkit/bin/fastq-dump --split-files --outdir ../rawdata --gzip SRR389248

for f in `cat CD4_SRA.txt`;

do mkdir /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/$f; 
cd /data/cbio/2021-mth-julie-devis/WGBS-processing/processed_data/$f;

#Trimming
/opt/TrimGalore/trim_galore --fastqc --path_to_cutadapt /home/users/jdevis/cutadapt --paired /data/cbio/2021-mth-julie-devis/WGBS-processing/rawdata/$f\_1.fastq /data/cbio/2021-mth-julie-devis/WGBS-processing/rawdata/$f\_2.fastq;

#Alignement
/home/users/jdevis/walt/bin/walt \
-i /data/cbio/2021-mth-julie-devis/WGBS-processing/genomes/hg38_by_Chr.dbindex \
-1 $f\_1_val_1.fq \
-2 $f\_2_val_2.fq \
-o $f\_mapping.mr;

#Duplicate-remover
LC_ALL=C sort -k 1,1 -k 2,2n -k 3,3n -k 6,6 -o $f.mr.sorted_start $f\_mapping.mr;
/opt/methpipe/build/duplicate-remover -S $f\_dremove_stat.txt -o $f.mr.dremove $f.mr.sorted_start;

#bsrate
/opt/methpipe/build/bsrate -c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o $f.bsrate $f.mr.dremove;

#Methylation levels
/opt/methpipe/build/methcounts -n \
-c /data/cbio/GENOMES/GRCh38_by_Chr/hgdownload.cse.ucsc.edu/chromosomes \
-o $f.meth $f.mr.dremove;

#Merge CpG
/opt/methpipe/build/symmetric-cpgs -o $f\_CpG_merge.meth $f.meth;

#Stats
/opt/methpipe/build/levels -o $f.levels $f.meth

done

Bisulfite-Seq data coming from a male newborn, a middle age donor and a male centenarian.

Quality of the data is good, with a mean of 13 reads per cytosine. However, middle age data has no info about the sex of the donor. By comparing the number of reads by chromosome in this sample, a female one and the male ones, we saw that the proportion of reads on chromosomes X and Y showed that the middle age data come from a woman. We then have to be very careful when analysing the X chromosome with the middle age data. To facilitate the analysis and compare all samples, I’ll remove the adult data on chrX.

columns <- c("chromosome", "position", "strand", "sequence context","methylation_level","number_of_reads")
CD4_newborn <- read_delim("data/CD4_newborn.meth", 
    delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
names(CD4_newborn) <- columns
CD4_middle <- read_delim("data/CD4_middle.meth", 
    delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
names(CD4_middle) <- columns
CD4_middle <- filter(CD4_middle, chromosome != "chrX")
CD4_cent <- read_delim("data/CD4_cent.meth", 
    delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
names(CD4_cent) <- columns
CD4_newborn$age <- "newborn"
CD4_middle$age <- "middle_age"
CD4_cent$age <- "centenarian"
CD4_newborn_GR <- makeGRangesFromDataFrame(CD4_newborn, start.field = "position", 
                                           end.field = "position", keep.extra.columns = TRUE)
CD4_middle_GR <- makeGRangesFromDataFrame(CD4_middle, start.field = "position", 
                                          end.field = "position", keep.extra.columns = TRUE)
CD4_cent_GR <- makeGRangesFromDataFrame(CD4_cent, start.field = "position", 
                                        end.field = "position", keep.extra.columns = TRUE)

4.3.2 CT genes methylation level

Here I extract methylation for all CpGs located in CT genes promoter (regulated by methylation because it’s these that we can find in PMDs).

CT_prom_GR <- makeGRangesFromDataFrame(CT_genes, start.field = "start_prom",
                                    end.field = "end_prom",
                                    keep.extra.columns = TRUE)

newborn <- mergeByOverlaps(CD4_newborn_GR, CT_prom_GR, ignore.strand = TRUE)[,1]
newborn$external_gene_name <- mergeByOverlaps(CD4_newborn_GR, CT_prom_GR, 
                                              ignore.strand = TRUE)$external_gene_name
newborn <- as.data.frame(newborn)
middle <- mergeByOverlaps(CD4_middle_GR, CT_prom_GR, ignore.strand = TRUE)[,1]
middle$external_gene_name <- mergeByOverlaps(CD4_middle_GR, CT_prom_GR, 
                                             ignore.strand = TRUE)$external_gene_name
middle <- as.data.frame(middle)
cent <- mergeByOverlaps(CD4_cent_GR, CT_prom_GR, ignore.strand = TRUE)[,1]
cent$external_gene_name <- mergeByOverlaps(CD4_cent_GR, CT_prom_GR, 
                                           ignore.strand = TRUE)$external_gene_name
cent <- as.data.frame(cent)
  
CTmeth_inCD4 <- rbind(newborn, middle, cent)
rm(newborn, middle, cent)
CTmeth_inCD4$age <- factor(CTmeth_inCD4$age, 
                           levels = c("newborn","middle_age","centenarian"))
CTmeth_inCD4[CTmeth_inCD4$number_of_reads == 0,]$methylation_level <- NA

length(unique(CTmeth_inCD4$external_gene_name))
## [1] 298

We can see here that we have information for all 298 CT genes. We now have to calculate the mean methylation level of each promoter to visualise it and compare to PMDs.

Some df that we obtain are empty, their CpG are not covered by the CD4 data, and so need to be removed. I also want to remove those who have data in only one age category, as we want to compare different age, they are of no need. When one of the age is missing, I’m adding it as NA to facilitate the rest (mostly gene on chrX and we removed data from adult)

reorganize_data_genes <- function(gene_name, data){
  reorganized_data <- data %>%
    filter(external_gene_name == gene_name) %>% 
    dplyr::select("start","methylation_level","age") %>% 
    na.omit() %>%  # to remove CpG rows with NA meth_level
    pivot_wider(names_from = start, values_from = methylation_level) %>% 
    column_to_rownames("age")
}

CT_per_gene_CD4_datasets <- sapply(unique(CTmeth_inCD4$external_gene_name), 
                                   reorganize_data_genes, data = CTmeth_inCD4) 

p <- rep(NA, length(CT_per_gene_CD4_datasets))
for(i in 1:length(CT_per_gene_CD4_datasets)){
 p[i] <- nrow(CT_per_gene_CD4_datasets[[i]]) > 1 & 
   is.data.frame(CT_per_gene_CD4_datasets[[i]])
}
CT_per_gene_CD4_datasets <- CT_per_gene_CD4_datasets[p] 

k <- rep(NA, length(CT_per_gene_CD4_datasets))
for(i in 1:length(CT_per_gene_CD4_datasets)){
  k[i] <- nrow(CT_per_gene_CD4_datasets[[i]]) == 2
}
for (gene in names(CT_per_gene_CD4_datasets[k])){
  tmp <- rep(NA, ncol(CT_per_gene_CD4_datasets[[gene]]))
  CT_per_gene_CD4_datasets[[gene]] <- rbind(CT_per_gene_CD4_datasets[[gene]], tmp)
  rownames(CT_per_gene_CD4_datasets[[gene]])[3] <- 
    c("newborn","middle_age", "centenarian")[(!c("newborn","middle_age","centenarian") %in% 
                                                rownames(CT_per_gene_CD4_datasets[[gene]]))]
  # naming the new row
}

for (gene in names(CT_per_gene_CD4_datasets)){
  CT_per_gene_CD4_datasets[[gene]] <- arrange(CT_per_gene_CD4_datasets[[gene]], 
                                              rownames(CT_per_gene_CD4_datasets[[gene]]))
  # Put rows in the right order
} 

rm(p, i, k, tmp)

CT_meth_means_pre <- sapply(CT_per_gene_CD4_datasets, rowMeans, na.rm = TRUE)

CT_meth_means <- CT_meth_means_pre %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("external_gene_name")

CT_meth_means$middle_age[is.na(CT_meth_means$middle_age)] <- NA

n_means_meth <- CT_meth_means %>% 
  filter(external_gene_name %in% CT_meth_regulated$external_gene_name) %>% 
  nrow()
  
CT_genes <- left_join(CT_genes, CT_meth_means)
## Joining with `by = join_by(external_gene_name)`

On the 298 CT genes, I have enough data to analyse 260 of them (127/160 meth regulated).

CT_genes_to_plot <- CT_genes %>% 
  dplyr::select(external_gene_name, chr, X_linked, regulated_by_methylation, 
                PMD, newborn, middle_age, centenarian) %>%
  filter(!is.na(PMD)) %>% 
  pivot_longer(names_to = "age", values_to = "methylation_level", 
               - c("external_gene_name","chr","X_linked", 
                   "regulated_by_methylation", "PMD"))
CT_genes_to_plot$age <- factor(CT_genes_to_plot$age, 
                               levels = c("newborn","middle_age","centenarian"))
CT_genes_to_plot %>%
  mutate(PMD = case_when(PMD == TRUE ~ "PMD",
                         PMD == FALSE ~ "not_PMD")) %>%
  mutate(regulated_by_methylation = case_when(regulated_by_methylation == TRUE ~ "meth_reg",
                                              regulated_by_methylation == FALSE ~ "not_meth_reg")) %>% 
  ggplot(aes(x = age, y = methylation_level, fill = age)) +
  geom_boxplot(outlier.shape = NA)+
  geom_jitter(aes(color = age), alpha = 0.3) +
  facet_grid(regulated_by_methylation ~ PMD) + 
  scale_fill_manual(values = c("rosybrown1", "palevioletred2", "violetred3")) +
  scale_color_manual(values = c("rosybrown3","palevioletred3","violetred4")) +
  theme_bw()
## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 78 rows containing missing values (`geom_point()`).

4.3.3 Comparison CT and PMDs

PMD_newborn <- mergeByOverlaps(CD4_newborn_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)[,1]
PMD_newborn$name <- mergeByOverlaps(CD4_newborn_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)$name
PMD_newborn <- as.data.frame(PMD_newborn)
PMD_middle <- mergeByOverlaps(CD4_middle_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)[,1]
PMD_middle$name <- mergeByOverlaps(CD4_middle_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)$name
PMD_middle <- as.data.frame(PMD_middle)
PMD_cent<- mergeByOverlaps(CD4_cent_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)[,1]
PMD_cent$name <- mergeByOverlaps(CD4_cent_GR, my_PMD_GM23248_GR, ignore.strand = TRUE)$name
PMD_cent <- as.data.frame(PMD_cent)
  
PMD_CD4 <- rbind(PMD_newborn, PMD_middle, PMD_cent)
rm(PMD_newborn, PMD_middle, PMD_cent)
PMD_CD4$age <- factor(PMD_CD4$age, levels = c("newborn","middle_age","centenarian"))
PMD_CD4[PMD_CD4$number_of_reads == 0,]$methylation_level <- NA

Let’s compare the evolution of CT methylation with that of the PMDs they’re in.

CT_reg_PMD_plot <- CT_genes_to_plot %>% 
  filter(regulated_by_methylation & PMD) %>% 
  ggplot(aes(x = methylation_level, color = age)) +
  geom_density() + 
  scale_color_manual(values = c("rosybrown1", "palevioletred2", "violetred3"))+
  xlab ("Promoter mean methylation level (CT meth reg and in PMDs)") +
  theme_minimal()


CT_reg_PMD_GR <- CT_prom_GR[!is.na(CT_prom_GR@elementMetadata$PMD) &
                              CT_prom_GR@elementMetadata$PMD & 
                              CT_prom_GR@elementMetadata$regulated_by_methylation, ]

CT_PMD_meth_plot <- PMD_CD4 %>% 
  filter(name %in% subsetByOverlaps(my_PMD_GM23248_GR,CT_reg_PMD_GR)$name) %>% 
  group_by(name, age) %>% 
  summarise("mean_methylation" = mean(methylation_level, na.rm = TRUE)) %>% 
  ggplot(aes(x = mean_methylation, color = age)) + 
  geom_density() +
  scale_color_manual(values = c("rosybrown1", "palevioletred2", "violetred3")) +
  theme_minimal() +
  xlab("CT's PMDs mean methylation level")
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
(CT_reg_PMD_plot + scale_x_continuous(limits = c(0,1))) / (CT_PMD_meth_plot + scale_x_continuous(limits = c(0,1)))
## Warning: Removed 34 rows containing non-finite values (`stat_density()`).

4.3.4 Comparison CT and non CT in PMDs

We don’t know if what we see happening in CT genes is common with other genes located in PMDs. If we can also see a demethylation of these genes promoters and that it is bigger than CT demethylation, it means that there would be a protection of CT genes against demethylation. We thus want to compare CT methylation to the one of other genes in PMDs.

4.3.4.1 Non-CT methylation

Get methylation data for all other genes than CT in PMDs.

non_CT_genes <- biomaRt_query %>% 
  mutate(start_prom = case_when(strand == 1 ~ transcription_start_site - 1000,
                                strand == -1 ~ transcription_start_site - 500)) %>% 
  mutate(end_prom = case_when(strand == 1 ~ transcription_start_site + 500,
                              strand == -1 ~ transcription_start_site + 1000)) %>% 
  filter(!external_gene_name %in% CT_genes$external_gene_name) %>% 
  filter(transcript_is_canonical == 1) %>%
  filter(chromosome_name%in%c(1:22, "X", "Y"))

non_CT_genes$strand <- ifelse(non_CT_genes$strand == "1", "+", "-")
seqlevelsStyle(non_CT_genes$chromosome_name) <- "UCSC"
non_CT_GR <- makeGRangesFromDataFrame(non_CT_genes,
                                      start.field = "start_prom",
                                      end.field = "end_prom",
                                      keep.extra.columns = TRUE)
non_CT_in_PMD <- subsetByOverlaps(non_CT_GR, my_PMD_GM23248_GR)

newborn <- mergeByOverlaps(CD4_newborn_GR, non_CT_in_PMD, ignore.strand = TRUE)[,1]
newborn$ensembl_gene_id <- mergeByOverlaps(CD4_newborn_GR, non_CT_in_PMD, 
                                           ignore.strand = TRUE)$ensembl_gene_id
newborn <- as.data.frame(newborn)
middle <- mergeByOverlaps(CD4_middle_GR, non_CT_in_PMD, ignore.strand = TRUE)[,1]
middle$ensembl_gene_id <- mergeByOverlaps(CD4_middle_GR, non_CT_in_PMD, 
                                          ignore.strand = TRUE)$ensembl_gene_id
middle <- as.data.frame(middle)
cent <- mergeByOverlaps(CD4_cent_GR, non_CT_in_PMD, ignore.strand = TRUE)[,1]
cent$ensembl_gene_id <- mergeByOverlaps(CD4_cent_GR, non_CT_in_PMD, 
                                        ignore.strand = TRUE)$ensembl_gene_id
cent <- as.data.frame(cent)
  
nonCTmeth_inCD4 <- rbind(newborn, middle, cent)
rm(newborn, middle, cent)
nonCTmeth_inCD4$age <- factor(nonCTmeth_inCD4$age, 
                           levels = c("newborn","middle_age","centenarian"))
nonCTmeth_inCD4[nonCTmeth_inCD4$number_of_reads == 0,]$methylation_level <- NA

reorganize_data_ensembl <- function(gene_name, data){
  reorganized_data <- data %>%
    filter(ensembl_gene_id == gene_name) %>% 
    dplyr::select("start","methylation_level","age") %>% 
    na.omit() %>%  # to remove CpG rows with NA meth_level
    pivot_wider(names_from = start, values_from = methylation_level) %>% 
    column_to_rownames("age")
}

# nonCT_per_gene_CD4_datasets <- bplapply(unique(nonCTmeth_inCD4$ensembl_gene_id), 
#                                       reorganize_data_ensembl, data = nonCTmeth_inCD4)
# names(nonCT_per_gene_CD4_datasets) <- unique(nonCTmeth_inCD4$ensembl_gene_id)
# 
# p <- rep(NA, length(nonCT_per_gene_CD4_datasets))
# for(i in 1:length(nonCT_per_gene_CD4_datasets)) {
#  p[i] <- nrow(nonCT_per_gene_CD4_datasets[[i]]) > 1 &
#    is.data.frame(nonCT_per_gene_CD4_datasets[[i]])
# }
# nonCT_per_gene_CD4_datasets <- nonCT_per_gene_CD4_datasets[p]
# 
# k <- rep(NA, length(nonCT_per_gene_CD4_datasets))
# for(i in 1:length(nonCT_per_gene_CD4_datasets)) {
#   k[i] <- nrow(nonCT_per_gene_CD4_datasets[[i]]) == 2
# }
# 
# for (gene in names(nonCT_per_gene_CD4_datasets[k])) {
#   tmp <- rep(NA, ncol(nonCT_per_gene_CD4_datasets[[gene]]))
#   nonCT_per_gene_CD4_datasets[[gene]] <- rbind(nonCT_per_gene_CD4_datasets[[gene]], tmp)
#   rownames(nonCT_per_gene_CD4_datasets[[gene]])[3] <-
#     c("newborn","middle_age", "centenarian")[(!c("newborn","middle_age","centenarian") %in%
#                                                 rownames(nonCT_per_gene_CD4_datasets[[gene]]))]
#   # naming the new row
# }
# 
# for (gene in names(nonCT_per_gene_CD4_datasets)) { 
#   nonCT_per_gene_CD4_datasets[[gene]] <- arrange(nonCT_per_gene_CD4_datasets[[gene]],
#                                               rownames(nonCT_per_gene_CD4_datasets[[gene]]))
#   # Put rows in the right order
# }
# 
# rm(p, i, k, tmp)

load("data_rmd/nonCT_per_gene_CD4_datasets.rda")

nonCT_meth_means <- sapply(nonCT_per_gene_CD4_datasets, rowMeans, na.rm = TRUE)
nonCT_meth_means <- nonCT_meth_means %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("ensembl_gene_id")

nonCT_meth_means$middle_age[is.na(nonCT_meth_means$middle_age)] <- NA

4.3.4.2 CT and non CT comparison

threshold <- CT_genes %>% 
  filter(regulated_by_methylation & PMD) %>% 
  dplyr::select(newborn) %>% 
  min(na.rm = TRUE)

nonCT_meth_means_thr <- nonCT_meth_means %>% 
  filter(newborn >= threshold)

nonCT_meth_means_to_plot <-  nonCT_meth_means_thr %>% 
  pivot_longer(names_to = "age", values_to = "methylation_level", 
               - "ensembl_gene_id")
nonCT_meth_means_to_plot$age <- factor(nonCT_meth_means_to_plot$age, 
                               levels = c("newborn","middle_age","centenarian"))


med_meth_nb_CT <- CT_genes_to_plot %>% 
  filter(regulated_by_methylation & PMD & age == "newborn") %>% 
  pull(methylation_level) %>% 
  median(na.rm = TRUE)
med_meth_nb_CT
## [1] 0.922374
med_meth_nb_nonCT <-nonCT_meth_means_to_plot %>% 
  filter(age == "newborn") %>% 
  pull(methylation_level) %>% 
  median(na.rm = TRUE)
med_meth_nb_nonCT
## [1] 0.8887351
med_meth_cent_CT <- CT_genes_to_plot %>% 
  filter(regulated_by_methylation & PMD & age == "centenarian") %>% 
  pull(methylation_level) %>% 
  median(na.rm = TRUE)
med_meth_cent_CT
## [1] 0.8357093
med_meth_cent_nonCT <-nonCT_meth_means_to_plot %>% 
  filter(age == "centenarian") %>% 
  pull(methylation_level) %>% 
  median(na.rm = TRUE)
med_meth_cent_nonCT
## [1] 0.7343142
CT_reg_PMD_boxplot <- CT_genes_to_plot %>%
  filter(regulated_by_methylation & PMD) %>% 
  ggplot(aes(x = age, y = methylation_level, fill = age)) +
  geom_boxplot(outlier.shape = NA)+
  geom_jitter(aes(color = age), alpha = 0.3) +
  scale_fill_manual(values = c("rosybrown1", "palevioletred2", "violetred3")) +
  scale_color_manual(values = c("rosybrown3","palevioletred3","violetred4")) +
  theme_bw()


nonCT_boxplot <- nonCT_meth_means_to_plot %>%
  ggplot(aes(x = age, y = methylation_level, fill = age)) +
  geom_jitter(aes(color = age), alpha = 0.3) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_manual(values = c("rosybrown1", "palevioletred2", "violetred3")) +
  scale_color_manual(values = c("rosybrown3","palevioletred3","violetred4")) +
  theme_bw()


(CT_reg_PMD_boxplot + ggtitle("CT genes in PMDs") + 
    scale_y_continuous(limits = c(0,1)) + theme(legend.position = "none")) + 
  (nonCT_boxplot + ggtitle("non CT genes in PMDs") + 
     scale_y_continuous(limits = c(0,1)))
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 34 rows containing missing values (`geom_point()`).
## Warning: Removed 879 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 892 rows containing missing values (`geom_point()`).

4.3.4.3 Differential analysis

However, this is only an observation based on the data, we need to do a quantification of this decrease and see its signification. For this, we have to do a differential methylation analysis.

First step is to check that there is indeed a loss between newborn and centenarian for each gene. To do so, we use a unidirectional (newborn > centenarian) paired statistical test on the methylation level of cytosines in both ages. Data can’t really be normalise so we use a Wilcoxon test (non-parametric t test). Our significativity threshold is 5%, which means that a test is significative if the adjusted (B-H method) pvalue is inferior to 0.05.

Second step is to quantify the loss of methylation between newborn and centenarian. To do so, we calculate a delta_meth for each gene, which is simply the difference between newborn and cent level.

CT_to_pval <- CTmeth_inCD4 %>%
  filter(external_gene_name %in% filter(CT_genes, 
                                        PMD & regulated_by_methylation)$external_gene_name) %>%
  dplyr::select(external_gene_name, seqnames, start, age, methylation_level) %>% 
  pivot_wider(names_from = "age", values_from = "methylation_level", values_fill = NA) %>% 
  filter(!is.na(newborn) & !is.na(centenarian)) %>% 
  left_join(dplyr::select(CT_genes, external_gene_name, ensembl_gene_id))
## Joining with `by = join_by(external_gene_name)`
non_CT_to_pval <- nonCTmeth_inCD4 %>% 
  filter(ensembl_gene_id %in% nonCT_meth_means_thr$ensembl_gene_id) %>%
  dplyr::select(ensembl_gene_id, seqnames, start, age, methylation_level) %>%
  pivot_wider(names_from = "age", values_from = "methylation_level", values_fill = NA) %>% 
  filter(!is.na(newborn) & !is.na(centenarian))

test_nb_cent <- function(gene_i, data){
    wilcox.test(
      as.matrix(filter(data, ensembl_gene_id == gene_i)$newborn), 
      as.matrix(filter(data, ensembl_gene_id == gene_i)$centenarian), 
      paired = TRUE, 
      alternative = "greater", 
      exact = FALSE)$p.value
}

pval_CT <- sapply(unique(CT_to_pval$ensembl_gene_id), test_nb_cent, CT_to_pval)
pval_nonCT <- sapply(unique(non_CT_to_pval$ensembl_gene_id), test_nb_cent, non_CT_to_pval)


CT_stat_analysis <- CT_genes %>% 
  filter(ensembl_gene_id %in% CT_to_pval$ensembl_gene_id) %>% 
  dplyr::select(ensembl_gene_id, external_gene_name, chr, X_linked, 
                 newborn, middle_age, centenarian) %>% 
  filter(!is.na(newborn) & !is.na(centenarian)) %>% 
  mutate("padj" = p.adjust(pval_CT, method = "BH"))
sign_table_CT <- table(CT_stat_analysis$padj < 0.05)

nonCT_stat_analysis <- nonCT_meth_means_thr %>% 
  filter(ensembl_gene_id %in% non_CT_to_pval$ensembl_gene_id) %>% 
  mutate("padj" = p.adjust(pval_nonCT, method = "BH"))
sign_table_nonCT <- table(nonCT_stat_analysis$padj < 0.05)

table(CT_stat_analysis$padj <= 0.05)[[2]]/length(CT_stat_analysis$padj)
## [1] 0.5737705
table(nonCT_stat_analysis$padj <= 0.05)[[2]]/length(nonCT_stat_analysis$padj)
## [1] 0.6306181
CT_stat_analysis <- CT_stat_analysis %>% 
  mutate("delta_meth" = centenarian - newborn)

nonCT_stat_analysis <- nonCT_stat_analysis %>% 
  mutate("delta_meth" = centenarian - newborn)

med_delta_CT <- CT_stat_analysis %>% 
  filter(padj <= 0.05) %>% 
  pull(delta_meth) %>% 
  median()
med_delta_CT
## [1] -0.09544093
med_delta_nonCT <- nonCT_stat_analysis %>% 
  filter(padj <= 0.05) %>% 
  pull(delta_meth) %>% 
  median()
med_delta_nonCT
## [1] -0.1633161
CT_stat_analysis %>% 
  filter(padj <= 0.05) %>%
  mutate(type = "CT_genes") %>%
  dplyr::select(ensembl_gene_id, centenarian, middle_age, newborn, padj, delta_meth, type) %>% 
  rbind(nonCT_stat_analysis %>%
          filter(padj <= 0.05) %>%
          mutate(type = "nonCT_genes")) %>% 
  
  ggplot(aes(y = delta_meth, x = type, fill = type)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = type), alpha = 0.2, size = 0.5) +
  scale_fill_manual(values = c("plum","lightgoldenrod1")) +
  scale_color_manual(values = c("darkorchid3","gold1")) +
  ggtitle("Delta_meth of PMD genes with significant demethylation") +
  xlab("") + 
  ylab("Delta_meth centenarian - newborn") +
  theme_bw() +
  theme(legend.position = "none")

There is indeed a difference in the amplitude of delta_meth between CT and nonCT genes in PMD.

We then want to test the difference between delta_meth for CT and nonCT genes in PMD, when pval is significative. For that we use a unidirectionnal (CT > non-CT) Mann-Whitney test, non-parametric (because no normality in delta_meth non-CT) version of student t test.

wilcox.test(filter(CT_stat_analysis, padj <= 0.05)$delta_meth,
            filter(nonCT_stat_analysis, padj <= 0.05)$delta_meth,
            alternative = "greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  filter(CT_stat_analysis, padj <= 0.05)$delta_meth and filter(nonCT_stat_analysis, padj <= 0.05)$delta_meth
## W = 210890, p-value = 6.937e-12
## alternative hypothesis: true location shift is greater than 0

pvalue here is significative. This shows then that delta_meth are smaller in nonCT genes and demethylation in nonCT genes is then higher than in CT genes.

However, the difference between delta_meth distributions could be the result of the biggest number of genes in nonCT (4249 VS 26 not sign, or 7254 VS 35 padj sign). In order to see that, let’s do a down sampling : taking 35 nonCT genes with a pval <= 0.05 and 26 with a pval > 0.05 a thousand times and calculate the mean of each distributions. I can visualise that after and see where CT genes goes compare to that.

random_delta_distrib_means <- function(i, data, n_sign, n_notsign){
  random_sample1 <- sample_n(filter(data, padj <= 0.05), n_sign)
  random_sample2 <- sample_n(filter(data, padj > 0.05), n_notsign)
  random_sample <- rbind(random_sample1, random_sample2)
  
  mean(random_sample$delta_meth)
}

distrib_means <- unlist(bplapply(1:1000, random_delta_distrib_means,
                                data = nonCT_stat_analysis, 
                                n_sign = sign_table_CT[[2]], 
                                n_notsign = sign_table_CT[[1]]))

mean(CT_stat_analysis$delta_meth)
## [1] -0.06122821
table(distrib_means >= mean(CT_stat_analysis$delta_meth))
## 
## FALSE 
##  1000
hist(distrib_means,
     col="dodgerblue1",
     border = "dodgerblue3",
     breaks = 20,
     main = paste0("Means of 1000 delta_meth distributions of nonCT genes when randomly 
     taking ", sign_table_CT[[2]], "significative and ", sign_table_CT[[1]], " not significative genes"),
     cex.main = 0.75,
     xlab = "Means",
     ylab = "Frequency",
     xlim = c(-0.2, 0))
abline(v = mean(CT_stat_analysis$delta_meth), col = "violetred1", lwd = 3)
text(mean(CT_stat_analysis$delta_meth), 125, "Mean of CT genes 
delta_meth distribution", col = "violetred3")

This shows that even with the same amount of genes, delta_meth are significantly smaller for CT genes than for nonCT genes, the effect is then not due to the amount of genes.

This analysis shows that CT genes located in PMDs undergo a demethylation related to aging that is smaller than the other genes in PMDs. Thus, age and demethylation that is associated to it couldn’t explain demethylation and activation of CT genes. There is then probably something that protects CT genes from PMD demethylation.

4.4 Why is there a protection on CT genes : CpG density hypothesis

CT genes are thus protected against demethylation in PMDs. Next question is to determine what confers this protection to CT genes. what One of our hypothesis for that protection is that CT genes are protected thanks to their density in CpG. Indeed, CT genes promoters are rich in CpGs while PMDs are poor. It is possible that this density protects CT genes from demethylation.

Promoter’s density in CpG is already computed in CTexploreR but we have to do it also for non-CT genes.

CT_density_analysis <- CT_genes %>% 
  filter(ensembl_gene_id %in% CT_stat_analysis$ensembl_gene_id) %>%
  mutate("padj" = CT_stat_analysis$padj) %>% 
  mutate("delta_meth" = CT_stat_analysis$delta_meth)
  
nonCT_density_analysis <- as.data.frame(non_CT_in_PMD) %>% 
  filter(ensembl_gene_id %in% nonCT_stat_analysis$ensembl_gene_id)%>%
  left_join(nonCT_stat_analysis)
## Joining with `by = join_by(ensembl_gene_id)`
colnames(nonCT_density_analysis)[1:3] <- c("chr", "start_prom", "end_prom")
nonCT_density_analysis$chr <- as.character(nonCT_density_analysis$chr)

cpg_density <- function(gene, data){
  temp <- data %>% 
    filter(ensembl_gene_id == gene) %>% 
    dplyr::select(ensembl_gene_id, start_prom, end_prom, chr)
  
  seqlevelsStyle(temp$chr) <- "NCBI"

  chrom <- genome[startsWith(names(genome), paste0(temp$chr, " "))][[1]]

  interest <- subseq(chrom, temp$start_prom, temp$end_prom)

  p <- matchPattern("CG", interest)
  length(p) / length(interest)
}

genome <- readDNAStringSet("../../cluster/GENOMES/Homo_sapiens.GRCh38.dna.primary_assembly.fa")

CT_cpg_density <- sapply(CT_density_analysis$ensembl_gene_id, cpg_density, 
                         CT_density_analysis)

nonCT_cpg_density <- unlist(bplapply(nonCT_density_analysis$ensembl_gene_id, cpg_density,
                            nonCT_density_analysis))

There is already the CpG_density column from CT_genes so careful, what I’ve done here is similar to what was already in it, it confirms.

4.4.1 Comparison CT and non CT with similar densities

CT_density_analysis <- CT_density_analysis %>% 
  mutate("CpG_density_analysis" = CT_cpg_density*100)

nonCT_density_analysis <- nonCT_density_analysis %>% 
  mutate("CpG_density_analysis" = nonCT_cpg_density*100)

CT_density_analysis %>% 
  filter(padj <= 0.05) %>%
  mutate(type = "CT_genes") %>%
  dplyr::select(ensembl_gene_id, centenarian, middle_age, newborn, padj, 
                delta_meth, CpG_density_analysis, type) %>% 
  rbind(nonCT_density_analysis %>%
          filter(padj <= 0.05) %>%
          mutate(type = "nonCT_genes") %>% 
          dplyr::select(ensembl_gene_id, centenarian, middle_age, newborn, padj, 
                delta_meth, CpG_density_analysis, type)) %>%
  
  ggplot(aes(x = CpG_density_analysis, color = type)) +
  geom_density() + 
  scale_color_manual(values = c("darkorchid3","gold1")) +
  xlab("CpG density in promoters") +
  ylab("Quantity (density plot)") +
  ggtitle("Distribution of CpG density on CT and non CT promoters
significantly demethylated in PMDs")+
  theme_minimal()

This shows/confirms that nonCT promoters in PMDs are poorer in CpG.

CT_density_analysis %>% 
  filter(padj <= 0.05) %>%
  mutate(type = "CT_genes") %>%
  
  ggplot(aes(x = delta_meth, y = CpG_density_analysis, color = type)) +
  geom_point() +
  geom_smooth(method = "loess", span = 2.5, color = "darkorchid4") +
  scale_color_manual(values = c("darkorchid3")) +
  xlab("Delta_meth of significatively 
demethylated genes") +
  ylab("CpG density of promoter") +
  facet_grid(~ type) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 8)) +
  scale_x_continuous(limits = c(-0.63, 0.05)) +
  
  nonCT_density_analysis %>%
          filter(padj <= 0.05) %>%
          mutate(type = "nonCT_genes") %>%
  
  ggplot(aes(x = delta_meth, y = CpG_density_analysis, color = type)) +
  geom_point() +
  geom_smooth(method = "loess", span = 0.75, color = "goldenrod2") +
  scale_color_manual(values = c("gold1")) +
  facet_grid(~ type) +
  theme_bw() + 
  ylab("") +
  xlab("Delta_meth of significatively 
demethylated genes") +
  scale_y_continuous(limits = c(0, 8)) +
  scale_x_continuous(limits = c(-0.63, 0.05)) +  
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

We see here a diagonal tendancy with a positive correlation, it shows that there is a link between CpG density and delta_meth. This tendancy is visible in CT and nonCT : the more CpG there are in promoters, less demethylation we observe. Isolated CpGs are then probably those that are easier demethylated in PMDs. We thus propose that the strong density in CpG that characterize CT promoters is the origin of their resistance to demethylation

Let’s compare only similar CpG densities, and do the same categories as in CTexploreR.

CT_density_analysis <- CT_density_analysis %>% 
  mutate(density_state = 
             case_when (CT_density_analysis$CpG_density_analysis >= 4 ~ "rich",
                        CT_density_analysis$CpG_density_analysis <= 2 ~ "poor",
                        CT_density_analysis$CpG_density_analysis > 2 & 
                          CT_density_analysis$CpG_density_analysis < 4 ~ "normal"))


nonCT_density_analysis <- nonCT_density_analysis %>% 
  mutate(density_state = 
             case_when (nonCT_density_analysis$CpG_density_analysis >= 4 ~ "rich",
                        nonCT_density_analysis$CpG_density_analysis <= 2 ~ "poor",
                        nonCT_density_analysis$CpG_density_analysis > 2 &
                          nonCT_density_analysis$CpG_density_analysis < 4 ~ "normal"))

By comparing only delta_meth from genes normal/rich in CpG, we can erase the effect of CpG density from the demethylation analysis.

CT_density_analysis %>% 
  filter(padj <= 0.05) %>%
  filter(density_state != "poor") %>% 
  mutate(type = "CT_genes") %>%
  dplyr::select(ensembl_gene_id, delta_meth, type) %>% 
  rbind(nonCT_density_analysis %>%
          filter(padj <= 0.05) %>%
          filter(density_state != "poor") %>%
          mutate(type = "nonCT_genes")%>%
          dplyr::select(ensembl_gene_id, delta_meth, type))%>%
  
  ggplot(aes(y = delta_meth, x = type, fill = type)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = type), alpha = 0.2, size = 0.5) +
  scale_fill_manual(values = c("plum","lightgoldenrod1")) +
  scale_color_manual(values = c("darkorchid3","gold1")) +
  ggtitle("Delta_meth of PMD genes with significant demethylation
and rich/normal in CpG") +
  xlab("") + 
  ylab("Delta_meth centenarian - newborn") +
  theme_bw() +
  theme(legend.position = "none")

rich_med_delta_CT <- CT_density_analysis %>% 
  filter(padj <= 0.05 & density_state != "poor") %>% 
  pull(delta_meth) %>% 
  median()
rich_med_delta_CT
## [1] -0.09461828
rich_med_delta_nonCT <-nonCT_density_analysis %>% 
  filter(padj <= 0.05 & density_state != "poor") %>% 
  pull(delta_meth) %>% 
  median()
rich_med_delta_nonCT
## [1] -0.08765917

Medians are now way closer to each other and we can see that distributions are closer to each other. For CT genes, the median is still a loss of 9.4618278%, however for nonCT the median loss has reduce to 8.7659168% ! The two distributions are now clearly comparable. It looks like the demethylation difference that we observed earlier between CT and nonCT genes in PMD during aging is now gone when we only analyse gene whose promoters are not poor in CpG.

Let’s do the same test as previously to see if there is a difference between both distributions (Mann-Withney) and if it isn’t due to the size of the sample.

wilcox.test(filter(CT_density_analysis, padj <= 0.05 & density_state != "poor")$delta_meth,
            filter(nonCT_density_analysis, padj <= 0.05 & density_state != "poor")$delta_meth,
            alternative = "greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  filter(CT_density_analysis, padj <= 0.05 & density_state != "poor")$delta_meth and filter(nonCT_density_analysis, padj <= 0.05 & density_state != "poor")$delta_meth
## W = 14362, p-value = 0.1347
## alternative hypothesis: true location shift is greater than 0
rich_CT_sign <- table(filter(CT_density_analysis, density_state != "poor")$padj <= 0.05)

distrib_means_rich <- unlist(bplapply(1:1000, random_delta_distrib_means,
                                data = filter(nonCT_density_analysis, density_state != "poor"), 
                                n_sign = rich_CT_sign[[2]], n_notsign = rich_CT_sign[[1]]))

mean_delta_rich <- mean(filter(CT_density_analysis, density_state != "poor")$delta_meth)
mean_delta_rich
## [1] -0.05581169
table(distrib_means_rich >= mean(filter(CT_density_analysis, 
                                        density_state != "poor")$delta_meth))
## 
## FALSE  TRUE 
##   735   265
hist(distrib_means_rich,
     col="dodgerblue1",
     border = "dodgerblue3",
     breaks = 20,
     main = paste0("Means of 1000 delta_meth distributions of nonCT genes when randomly 
     taking ", rich_CT_sign[[2]], " significative and ", rich_CT_sign[[1]], " not significative genes"),
     cex.main = 0.75,
     xlab = "Means",
     ylab = "Frequency",
     xlim = c(-0.15, 0))
abline(v = mean_delta_rich, col = "violetred1", lwd = 3)
text(mean_delta_rich, 125, "Mean of rich CT genes 
delta_meth distribution", col = "violetred3")

pvalue is not significative anymore which means that there is ano difference between both distribution.

Also, we can see when doing 1000 mean of delta_meth with the same amount of genes that the mean for CT genes is in the middle of the distribution, which means that there is no real difference between CT and nonCT demethylation when taking the CpG density into account

It thus looks like CpG density plays a big role in the protection as when we’re taking that into account in our analysis, the difference between CT and nonCT disappears.

When analysing genes with comlparable CpG density, there is no difference in the demethylation of CT and nonCT genes in PMD. This shows that the richness in CpG of the promoter gives a certain protection against PMD demethylation during aging.

4.4.2 solo-CpG methylation

In summary, although CT genes are found in PMDs, they lose little methylation and are protected from continued demethylation during aging. This protection seems to be explained by the CpG density of the promoters. The demethylation of CT genes visible in tumours is therefore not associated with cell proliferation. However, the question remains: why does a high CpG density protect against continued demethylation? Our hypothesis is that there may be a self-maintenance system, i.e. when a CpG loses its methylation in a CpG-rich region, the other still methylated CpG methylated CpGs attract the methylation maintenance machinery, thereby promoting the remethylation of the demethylated site. It should be noted that our observations are consistent with some analyses performed by Zhou et al. (2018) which showed that that the hypomethylation of PMDs was better observed by focusing only on the methylation of certain CpGs, called “solo-CpGs”, corresponding to isolated CpGs at least 35 bp away from a neighbouring CpG and flanked by a A or a T (WCGW).

We’re going to use Zhou’s coordinate of solo-WCGW to look a bit more into that.

solo_WCGW <- read_delim("data/zhou_solo_WCGW_hg38.bed", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)
solo_WCGW <- solo_WCGW[, -5]
colnames(solo_WCGW) <- c("seqnames", "start", "end", "context")
CT_with_solo <- CTmeth_inCD4 %>%
  filter(external_gene_name %in% filter(CT_genes, 
                                        PMD & regulated_by_methylation)$external_gene_name) %>%
  dplyr::select(external_gene_name, seqnames, start, age, methylation_level) %>% 
  left_join(dplyr::select(CT_genes, external_gene_name, ensembl_gene_id)) %>% 
  left_join(solo_WCGW) 
## Joining with `by = join_by(external_gene_name)`
## Joining with `by = join_by(seqnames, start)`
CT_with_solo <- CT_with_solo %>% 
  mutate(solo_state = case_when(is.na(CT_with_solo$context) ~ "not-solo",
                                !is.na(CT_with_solo$context) ~ "solo-CpG")) %>% 
  mutate(type = "CT_genes")



nonCT_with_solo <- nonCTmeth_inCD4 %>% 
  filter(ensembl_gene_id %in% nonCT_meth_means_thr$ensembl_gene_id) %>%
  dplyr::select(ensembl_gene_id, seqnames, start, age, methylation_level) %>%
  left_join(solo_WCGW)
## Joining with `by = join_by(seqnames, start)`
nonCT_with_solo <- nonCT_with_solo %>% 
  mutate(solo_state = case_when(is.na(nonCT_with_solo$context) ~ "not-solo",
                                !is.na(nonCT_with_solo$context) ~ "solo-CpG"))%>% 
  mutate(type = "nonCT_genes")

CT_with_solo %>% 
  dplyr::select(colnames(nonCT_with_solo)) %>% 
  rbind(nonCT_with_solo) %>% 
  
  ggplot(aes(y = methylation_level, x = age, 
             fill = solo_state)) +
  geom_jitter(aes(color = age), alpha = 0.2, size = 0.5) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_manual(values = c("lightskyblue1", "darkseagreen2")) +
  scale_color_manual(values = c("rosybrown3","palevioletred3","violetred4")) +
  ggtitle("CpG methylation in all ages, depending
on their solo state, in CT and nonCT genes") +
  facet_grid(type ~ solo_state) +
  theme_bw() 
## Warning: Removed 27881 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 27881 rows containing missing values (`geom_point()`).

What we see here confirms what zhou said and what we hypothesize, we can see that solo-cpg are losing more methylation following aging.

5 SessionInfo

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Biostrings_2.68.1           XVector_0.40.0             
##  [3] patchwork_1.1.2             BiocParallel_1.34.2        
##  [5] DOSE_3.26.1                 clusterProfiler_4.8.1      
##  [7] org.Hs.eg.db_3.17.0         AnnotationDbi_1.62.2       
##  [9] SingleCellExperiment_1.22.0 circlize_0.4.15            
## [11] ComplexHeatmap_2.16.0       UpSetR_1.4.0               
## [13] factoextra_1.0.7            SummarizedExperiment_1.30.2
## [15] Biobase_2.60.0              GenomicRanges_1.52.0       
## [17] GenomeInfoDb_1.36.1         IRanges_2.34.1             
## [19] S4Vectors_0.38.1            MatrixGenerics_1.12.3      
## [21] matrixStats_1.0.0           lubridate_1.9.2            
## [23] forcats_1.0.0               stringr_1.5.0              
## [25] dplyr_1.1.2                 purrr_1.0.2                
## [27] tidyr_1.3.0                 tibble_3.2.1               
## [29] ggplot2_3.4.2               tidyverse_2.0.0            
## [31] biomaRt_2.56.0              Vennerable_3.0             
## [33] xtable_1.8-4                gtools_3.9.4               
## [35] reshape_0.8.9               RColorBrewer_1.1-3         
## [37] lattice_0.21-8              RBGL_1.76.0                
## [39] graph_1.78.0                BiocGenerics_0.46.0        
## [41] CTexploreR_0.99.0           CTdata_1.0.2               
## [43] readr_2.1.4                 readxl_1.4.2               
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.0                 later_1.3.1                  
##   [3] bitops_1.0-7                  ggplotify_0.1.0              
##   [5] filelock_1.0.2                cellranger_1.1.0             
##   [7] polyclip_1.10-4               XML_3.99-0.14                
##   [9] lifecycle_1.0.3               doParallel_1.0.17            
##  [11] vroom_1.6.3                   MASS_7.3-60                  
##  [13] magrittr_2.0.3                sass_0.4.7                   
##  [15] rmarkdown_2.23                jquerylib_0.1.4              
##  [17] yaml_2.3.7                    httpuv_1.6.11                
##  [19] cowplot_1.1.1                 DBI_1.1.3                    
##  [21] abind_1.4-5                   zlibbioc_1.46.0              
##  [23] ggraph_2.1.0                  RCurl_1.98-1.12              
##  [25] yulab.utils_0.0.6             tweenr_2.0.2                 
##  [27] rappdirs_0.3.3                GenomeInfoDbData_1.2.10      
##  [29] enrichplot_1.20.0             ggrepel_0.9.3                
##  [31] tidytree_0.4.2                codetools_0.2-19             
##  [33] DelayedArray_0.26.7           xml2_1.3.4                   
##  [35] ggforce_0.4.1                 tidyselect_1.2.0             
##  [37] shape_1.4.6                   aplot_0.1.10                 
##  [39] farver_2.1.1                  viridis_0.6.3                
##  [41] BiocFileCache_2.8.0           jsonlite_1.8.7               
##  [43] GetoptLong_1.0.5              ellipsis_0.3.2               
##  [45] tidygraph_1.2.3               iterators_1.0.14             
##  [47] foreach_1.5.2                 tools_4.3.0                  
##  [49] progress_1.2.2                treeio_1.24.1                
##  [51] Rcpp_1.0.11                   glue_1.6.2                   
##  [53] gridExtra_2.3                 mgcv_1.8-42                  
##  [55] xfun_0.40                     qvalue_2.32.0                
##  [57] withr_2.5.0                   BiocManager_1.30.22          
##  [59] fastmap_1.1.1                 fansi_1.0.4                  
##  [61] digest_0.6.33                 gridGraphics_0.5-1           
##  [63] timechange_0.2.0              R6_2.5.1                     
##  [65] mime_0.12                     colorspace_2.1-0             
##  [67] Cairo_1.6-0                   GO.db_3.17.0                 
##  [69] RSQLite_2.3.1                 utf8_1.2.3                   
##  [71] generics_0.1.3                data.table_1.14.8            
##  [73] prettyunits_1.1.1             graphlayouts_1.0.0           
##  [75] httr_1.4.6                    S4Arrays_1.0.5               
##  [77] scatterpie_0.2.1              pkgconfig_2.0.3              
##  [79] gtable_0.3.3                  blob_1.2.4                   
##  [81] shadowtext_0.1.2              htmltools_0.5.6              
##  [83] fgsea_1.26.0                  clue_0.3-64                  
##  [85] scales_1.2.1                  png_0.1-8                    
##  [87] ggfun_0.0.9                   knitr_1.43                   
##  [89] rstudioapi_0.14               tzdb_0.4.0                   
##  [91] reshape2_1.4.4                rjson_0.2.21                 
##  [93] nlme_3.1-162                  curl_5.0.1                   
##  [95] cachem_1.0.8                  GlobalOptions_0.1.2          
##  [97] BiocVersion_3.17.1            parallel_4.3.0               
##  [99] HDO.db_0.99.1                 pillar_1.9.0                 
## [101] vctrs_0.6.3                   promises_1.2.1               
## [103] dbplyr_2.3.3                  cluster_2.1.4                
## [105] evaluate_0.21                 magick_2.7.4                 
## [107] cli_3.6.1                     compiler_4.3.0               
## [109] rlang_1.1.1                   crayon_1.5.2                 
## [111] labeling_0.4.2                plyr_1.8.8                   
## [113] stringi_1.7.12                viridisLite_0.4.2            
## [115] munsell_0.5.0                 lazyeval_0.2.2               
## [117] GOSemSim_2.26.0               Matrix_1.5-4.1               
## [119] ExperimentHub_2.8.1           hms_1.1.3                    
## [121] bit64_4.0.5                   KEGGREST_1.40.0              
## [123] shiny_1.7.4.1                 highr_0.10                   
## [125] interactiveDisplayBase_1.38.0 AnnotationHub_3.8.0          
## [127] igraph_1.4.3                  memoise_2.0.1                
## [129] bslib_0.5.0                   ggtree_3.8.0                 
## [131] fastmatch_1.1-3               bit_4.0.5                    
## [133] downloader_0.4                gson_0.1.0                   
## [135] ape_5.7-1